75f069ca61a070937f5208098c27da1380294f85,src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotator.java,TokensRegexNERAnnotator,readEntries,#String#String#BufferedReader#Set#boolean#boolean#,315

Before Change


                                         BufferedReader mapping,
                                         Set<String> noDefaultOverwriteLabels,
                                         boolean ignoreCase, boolean verbose) throws IOException {
    List<Entry> entries = new ArrayList<Entry>();
    TrieMap<String,Entry> seenRegexes = new TrieMap<String,Entry>();
    int lineCount = 0;
    for (String line; (line = mapping.readLine()) != null; ) {
      lineCount ++;
      String[] split = line.split("\t");
      if (split.length < 2 || split.length > 4)
        throw new IllegalArgumentException("Provided mapping file is in wrong format");

      String[] regexes = split[0].trim().split("\\s+");
      String[] key = regexes;
      if (ignoreCase) {
        key = new String[regexes.length];
        for (int i = 0; i < regexes.length; i++) {
          key[i] = regexes[i].toLowerCase();
        }
      }
      String type = split[1].trim();

      Set<String> overwritableTypes = Generics.newHashSet();
      double priority = 0.0;

      if (split.length >= 3) {
        overwritableTypes.addAll(Arrays.asList(split[2].trim().split(",")));
      }
      if (split.length == 4) {
        try {
          priority = Double.parseDouble(split[3].trim());
        } catch(NumberFormatException e) {
          throw new IllegalArgumentException("ERROR: Invalid line " + lineCount
                  + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e);
        }
      }

      Entry entry = null;
      if (seenRegexes.containsKey(key)) {
        Entry oldEntry = seenRegexes.get(key);
        if (priority > oldEntry.priority) {
          entry = new Entry(regexes, type, overwritableTypes, priority);
          logger.warn("TokensRegexNERAnnotator " + annotatorName +
                  ": Replace duplicate entry (higher priority): old=" + oldEntry + ", new=" + entry);
        } else {
          if (!oldEntry.type.equals(type)) {
            if (verbose) {
              logger.warn("TokensRegexNERAnnotator " + annotatorName +
                      ": Ignoring duplicate entry: " + split[0] + ", old type = " + oldEntry.type + ", new type = " + type);
            }
          }
          continue;
        }
      } else {
        entry = new Entry(regexes, type, overwritableTypes, priority);
      }

      // Print some warning about the type
      int commaPos = entry.type.indexOf(',');
      if (commaPos > 0) {
        // Strip the "," and just take first type
        String newType = entry.type.substring(0, commaPos).trim();
        logger.warn("TokensRegexNERAnnotator " + annotatorName +
                ": Entry has multiple type " + entry + ", taking type to be " + newType);
        entry.type = newType;
      }

      // Print some warning if label belongs to noDefaultOverwriteLabels but there is no overwritable types
      if (entry.overwritableTypes.isEmpty() && noDefaultOverwriteLabels.contains(entry.type)) {
        logger.warn("TokensRegexNERAnnotator " + annotatorName +
                ": Entry doesn't have overwriteable types " + entry + ", but entry type is in noDefaultOverwriteLabels");
      }

      entries.add(entry);
      seenRegexes.put(key, entry);
    }

    logger.log("TokensRegexNERAnnotator " + annotatorName +
            ": Read " + entries.size() + " unique entries out of " + lineCount + " from " + mappingFilename);
    // System.err.println(entries);
    return entries;
  }

After Change


                                         BufferedReader mapping,
                                         Set<String> noDefaultOverwriteLabels,
                                         boolean ignoreCase, boolean verbose) throws IOException {
    int origEntriesSize = entries.size();
    int isTokensRegex = 0;
    int lineCount = 0;
    for (String line; (line = mapping.readLine()) != null; ) {
      lineCount ++;
      String[] split = line.split("\t");
      if (split.length < 2 || split.length > 4)
        throw new IllegalArgumentException("Provided mapping file is in wrong format");

      String regex = split[0].trim();
      String tokensRegex = null;
      String[] regexes = null;
      if (regex.startsWith("( ") && regex.endsWith(" )")) {
        // Tokens regex
        tokensRegex = regex;
      } else {
        regexes = regex.split("\\s+");
      }
      String[] key = (regexes != null)? regexes: new String[] { tokensRegex };
      if (ignoreCase) {
        String[] norm = new String[key.length];
        for (int i = 0; i < key.length; i++) {
          norm[i] = key[i].toLowerCase();
        }
        key = norm;
      }
      String type = split[1].trim();

      Set<String> overwritableTypes = Generics.newHashSet();
      double priority = 0.0;

      if (split.length >= 3) {
        overwritableTypes.addAll(Arrays.asList(split[2].trim().split(",")));
      }
      if (split.length == 4) {
        try {
          priority = Double.parseDouble(split[3].trim());
        } catch(NumberFormatException e) {
          throw new IllegalArgumentException("ERROR: Invalid line " + lineCount
                  + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e);
        }
      }

      Entry entry = new Entry(tokensRegex, regexes, type, overwritableTypes, priority);
      if (seenRegexes.containsKey(key)) {
        Entry oldEntry = seenRegexes.get(key);
        if (priority > oldEntry.priority) {
          logger.warn("TokensRegexNERAnnotator " + annotatorName +
                  ": Replace duplicate entry (higher priority): old=" + oldEntry + ", new=" + entry);
        } else {
          if (!oldEntry.type.equals(type)) {
            if (verbose) {
              logger.warn("TokensRegexNERAnnotator " + annotatorName +
                      ": Ignoring duplicate entry: " + split[0] + ", old type = " + oldEntry.type + ", new type = " + type);
            }
          }
          continue;
        }
      }

      // Print some warning about the type
      int commaPos = entry.type.indexOf(',');
      if (commaPos > 0) {
        // Strip the "," and just take first type
        String newType = entry.type.substring(0, commaPos).trim();
        logger.warn("TokensRegexNERAnnotator " + annotatorName +
                ": Entry has multiple type " + entry + ", taking type to be " + newType);
        entry.type = newType;
      }

      // Print some warning if label belongs to noDefaultOverwriteLabels but there is no overwritable types
      if (entry.overwritableTypes.isEmpty() && noDefaultOverwriteLabels.contains(entry.type)) {
        logger.warn("TokensRegexNERAnnotator " + annotatorName +
                ": Entry doesn't have overwriteable types " + entry + ", but entry type is in noDefaultOverwriteLabels");
      }

      entries.add(entry);
      seenRegexes.put(key, entry);
      if (entry.tokensRegex != null) isTokensRegex++;
    }

    logger.log("TokensRegexNERAnnotator " + annotatorName +
            ": Read " + (entries.size() - origEntriesSize) + " unique entries out of " + lineCount + " from " + mappingFilename
       + ", " + isTokensRegex + " TokensRegex patterns.");
    return entries;
  }